knitr::opts_chunk$set(echo = TRUE) set.seed(1)
## Load packages library(devtools) library(dplyr) library(ggplot2) library(cowplot) library(xtable) load_all('.') # library('gws') library(moiR) library(bigsnpr) library(bigstatsr)
Given a genotyped population of $N$ individuals and $M$ SNPs, we have the $X$ matrix. From $X$ this matrix we can obtain the covariance matrix $V$ between all SNPs as $\frac{1}{N} X^T X$, i.e. the linkage disequilibrium matrix. The diagonal of this matrix is $D$.
For this population of genotypes, we have a Normally distributed phenotype $Y$. Thus, a regular multivariate linear model can be written as $Y = X\beta + \epsilon$, and the $\beta$ is tipically solved in GWA studies marginally as $\beta_m = (X^TX)^{-1} X^T Y$. The problem comes when the different SNPs are not independent variables. In such case, marginal effects will not reflect true effects. Biologically, this is because of a linkage dissequilibrium confounder.
A trivial model extension can be done in the framework of multiple regression. The \beta effects can be estimated accounting for the $V$ covariance matrix as: $\beta = (X^TX)^{-1} V^{-1} X^T Y$. This can be also estimated directly from the marginal effects as: $\beta=V^{-1}D\beta_m$.
The above development requires a invertible $V$. This is not possible when there are SNPs in complete linkage or when the number of SNPs is larger than the number of individuals, because $V$ is rank deficient. In such case, an approximation of the inverted matrix could be achieved by several options. The algorithm of Higham to find the nearest positive definite matrix, would find, by definition, the closest invertible matrix. Choleschy factorization can help but not always. Finally, we can use the Moore-Penrose generalized inverse of a matrix.
### Real data # genomes<-readRDS("genome.rda") data(genomes) G <- genomes$genotypes X. = G[,sample(1:ncol(G),size = 250)] N=nrow(X.) M=ncol(X.) ### Dummy data Xdummy<-X. Xdummy[is.numeric(Xdummy)]<-sample(c(0,1,2),size = length(Xdummy),replace = TRUE) ### Decide real vs dummy X.<-Xdummy
hist(X., xlab="allele dossage",main="")
# Center and scale genome matrix X. = apply(X., 2, function(x) { x[ x== (-1)] <- 1 ; x}) X. = apply(X., 2, function(x) { x[ is.na(x)] <- 1 ; x}) # X. = apply(X., 2, function(x) { x - 1 }) ## this centers the matrix in 0 X = apply(X.,2, function(x) { mu=mean(x) sig=sd(x)+1e-10 (x-mu)/sig } ) # Check it worked # apply(X,2,mean) # apply(X,2,var)
# Calculate V= t(X) %*% X V= 1/N * t(X) %*% X D = diag(V) # Check it works, compared to base function in R plot(var(X),V)
Vinv=MASS::ginv(V) Vinv=solve(V)
set.seed(0) Y = rnorm(N,mean=1,sd=1) Y = meanvarcent(Y) qplot(Y,geom="histogram",main="Gaussian phenotype")
b= solve(t(X)%*%X) %*% t(X) %*% Y b2=1/N * Vinv %*% t(X) %*% Y # b == b2 # round(b,digits = 2) == round(b2,digits = 2) # doublecheck bgwa= solve(t(X)%*%X) %*% solve(diag(D)) %*% t(X) %*% Y # the diagonal matrix not necessary, just visualization # Incorrect. This is multiple regression, so it generates partial regression coefficients. bgwa=sapply(1:M,function(m) solve(t(X[,m])%*%X[,m]) %*% t(X[,m]) %*% Y ) var_bgwa= sapply(1:M, function(m) solve(N*t(X[,m])%*%X[,m]) %*% t(Y - (X[,m]*bgwa[m])) %*% (Y - (X[,m]*bgwa[m]))^2 # solve(N*t(X[,m])%*%X[,m]) %*% t(Y - (X[,m]*bgwa[m])) %*% (Y - (X[,m]*bgwa[m])) ) sd_b=sapply(1:M, function(m) solve(N*t(X[,m])%*%X[,m]) %*% t(Y - (X[,m]*b[m])) %*% (Y - (X[,m]*b[m])) ^2 # N*Vinv %*% t(Y - (X[,m]*b[m])) %*% (Y - (X[,m]*b[m])) ) # var_b= sd_b %*% Vinv lm plot(var_b,b) plot(var_bgwa,bgwa) # bgwa.lm=sapply(1:M,function(m) # Double check # lm(Y~X[,m])$coefficients[2] # ) # # var_bgwa.lm= sapply(1:M, # function(m) # sqrt(diag(vcov( lm(Y~X[,m]) )))[2] ^ 2 # ) bg.lm=lm(Y~X)$coefficients[-1] var_b.lm=diag(vcov( lm(Y~X) ))[-1] plot( var_b.lm, sd_b %*% Vinv ) plot(bg.lm,b) plot(var_b.lm,bg.lm) plot(sd_b,var_b.lm) plot(var_b,var_b.lm) MSE_bgwa= 1/N * sum( (Y - (X %*% bgwa))^2 ) plot(bgwa,var_bgwa) plot(bgwa.lm,var_bgwa.lm) MSE_b= 1/N * sum( (Y - (X %*% b))^2 ) bgwa2=1/N* solve(diag(D)) %*% t(X) %*% Y # the diagonal matrix not necessary, just visualization bgwa3= solve(t(X)%*%X) %*% solve(diag(D)) %*% t(X) %*% Y # the diagonal matrix not necessary, just visualization plot(bgwa,bgwa2) plot(bgwa,bgwa3) bgwa==bgwa2 bgwa==bgwa3 lm(Y~X[,1])$coefficients[2] bgwa[1] bgwa2[1] bgwa3[1] b[1] plot(b,bgwa3) plot(lm(Y~X[,1])$coefficients[2], bgwa[1]) # the diagonal matrix not necessary, just visualization boxplot(Y~X.[,1]) lm(Y~X[,1]) bgwa[1] b[1] b2= Vinv %*% bgwa b[1] b==b2 plot(b,b2) p0<-ggdotscolor(x=b,y=bgwa, ylab = expression(beta[gwa]), xlab = expression(beta) ) %>% addggregression(se=FALSE) p0 p1<-ggplot(data.frame(bgwa)) +geom_histogram(aes(x=bgwa)) +xlab(expression(beta[gwa])) p2<-ggplot(data.frame(bgwa)) +geom_histogram(aes(x=b)) +xlab(expression(beta)) plot_grid(p0,plot_grid(p1,p2,ncol=2), ncol=1)
MSE= 1/N * sum( (Y - (X %*% b))^2 ) var_b= rep(MSE,M) %*% solve(t(X) %*% X) MSEgwa= 1/N * t(Y - (X %*% bgwa)) %*% (Y - (X %*% bgwa)) var_bgwa= solve(N*t(X)%*%X) %*% t(Y - (X %*% bgwa)) %*% (Y - (X %*% bgwa)) print("The MSE of marginal analysis is: ",MSEgwa) print("The MSE of conditional analysis is: ",MSE) # out=matrix(c(MSEgwa,MSE)) # rownames(out)<-c("bgwa","b") # colnames(out)<-"MSE" # tab <- xtable(out,digits=c(2,2)) # print(tab, type="html")
eff=rnorm(ncol(X),mean = 0,sd = 1) Y=X. %*% eff # this is right! Y = meanvarcent(Y) qplot(Y,geom="histogram",main="Phenotype from Gaussian SNP effects") Y.gauss=Y
eff=rgamma(ncol(X),0.1,0.1) eff =eff /sd(eff) eff=eff* sample(c(-1,1),ncol(X),replace = TRUE) eff=sample(c(-1,0,1),replace = TRUE, prob = c(0.025,0.95,0.025),size = ncol(X)) Y=X. %*% eff # this is right! Y = meanvarcent(Y) qplot(Y,geom="histogram",main="Phenotype from exponential SNP effects") Y.skewed=Y
b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.gauss # check if dividing is needed by individual bgwa= solve(t(X)%*%X) %*% t(X) %*% Y.gauss # the diagonal matrix not necessary, just visualization p1<-ggplot(data.frame(eff,b,bgwa)) + geom_point(aes(x=eff,y=bgwa),shape=1) + stat_smooth(aes(x=eff,y=bgwa),method="glm",se=FALSE,col="black")+ geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=3,label=format(cor.test(bgwa,eff)$estimate,digits=3)))+ geom_point(aes(x=eff,y=b),shape=1,col="red") + stat_smooth(aes(x=eff,y=b),method="glm",se=FALSE,col="red")+ geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=5,label=format(cor.test(b,eff)$estimate,digits=3),color="red"))+ ylab(expression(paste("estimated ",beta)))+ xlab("True effect")+ guides(color=FALSE) # p1 b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.skewed bgwa= solve(t(X)%*%X) %*% t(X) %*% Y.skewed p2<-ggplot(data.frame(eff,b,bgwa)) + geom_point(aes(x=eff,y=bgwa),shape=1) + stat_smooth(aes(x=eff,y=bgwa),method="glm",se=FALSE,col="black")+ geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=3,label=format(cor.test(bgwa,eff)$estimate,digits=3)))+ geom_point(aes(x=eff,y=b),shape=1,col="red") + stat_smooth(aes(x=eff,y=b),method="glm",se=FALSE,col="red")+ geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=5,label=format(cor.test(b,eff)$estimate,digits=3),color="red"))+ ylab(expression(paste("estimated ",beta)))+ xlab("True effect")+ guides(color=FALSE) # p2 plot_grid(p1,p2,ncol=2,labels=c("Gaussian SNP effects","Exponential SNP effects"))
### Real data X. = G[,sample(1:ncol(G),size = 250)] N=nrow(X.) M=ncol(X.)
# Center and scale genome matrix X. = apply(X., 2, function(x) { x[ x== (-1)] <- 1 ; x}) X. = apply(X., 2, function(x) { x[ is.na(x)] <- 1 ; x}) # X. = apply(X., 2, function(x) { x - 1 }) ## this centers the matrix in 0 X = apply(X.,2, function(x) { mu=mean(x) sig=sd(x)+1e-10 (x-mu)/sig } )
# Calculate V = 1/N * t(X) %*% X D = diag(V) # Check it works, compared to base function in R plot(var(X),V)
Vinv=MASS::ginv(V)
eff=rnorm(ncol(X),mean = 0,sd = 1) Y=X. %*% eff # this is right! Y = meanvarcent(Y) qplot(Y,geom="histogram",main="Gaussian SNP effects") Y.gauss=Y
eff=rgamma(ncol(X),0.1,0.1) eff =eff /sd(eff) eff=eff* sample(c(-1,1),ncol(X),replace = TRUE) eff=sample(c(-1,0,1),replace = TRUE, prob = c(0.025,0.95,0.025),size = ncol(X)) Y=X. %*% eff # this is right! Y = meanvarcent(Y) qplot(Y,geom="histogram",main="Skewed SNP effects") Y.skewed=Y
b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.gauss # check if dividing is needed by individual bgwa= solve(t(X)%*%X) %*% t(X) %*% Y.gauss # the diagonal matrix not necessary, just visualization p1<-ggplot(data.frame(eff,b,bgwa)) + geom_point(aes(x=eff,y=bgwa),shape=1) + stat_smooth(aes(x=eff,y=bgwa),method="glm",se=FALSE,col="black")+ geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=3,label=format(cor.test(bgwa,eff)$estimate,digits=3))) + geom_point(aes(x=eff,y=b),shape=1,col="red") + stat_smooth(aes(x=eff,y=b),method="glm",se=FALSE,col="red")+ geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=5,label=format(cor.test(b,eff)$estimate,digits=3),color="red"))+ ylab(expression(paste("estimated ",beta)))+ xlab("True effect")+ guides(color=FALSE) # p1 b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.skewed bgwa= solve(t(X)%*%X) %*% t(X) %*% Y.skewed p2<-ggplot(data.frame(eff,b,bgwa)) + geom_point(aes(x=eff,y=bgwa),shape=1) + stat_smooth(aes(x=eff,y=bgwa),method="glm",se=FALSE,col="black")+ geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=3,label=format(cor.test(bgwa,eff)$estimate,digits=3)))+ geom_point(aes(x=eff,y=b),shape=1,col="red") + stat_smooth(aes(x=eff,y=b),method="glm",se=FALSE,col="red")+ geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=5,label=format(cor.test(b,eff)$estimate,digits=3),color="red"))+ ylab(expression(paste("estimated ",beta)))+ xlab("True effect")+ guides(color=FALSE) # p2 plot_grid(p1,p2,ncol=2,labels=c("Gaussian SNP effects","Exponential SNP effects"))
### Real data X. = G[,sample(1:ncol(G),size = 250)] N=nrow(X.) M=ncol(X.)
# Center and scale genome matrix X. = apply(X., 2, function(x) { x[ x== (-1)] <- 1 ; x}) X. = apply(X., 2, function(x) { x[ is.na(x)] <- 1 ; x}) # X. = apply(X., 2, function(x) { x - 1 }) ## this centers the matrix in 0 X = apply(X.,2, function(x) { mu=mean(x) sig=sd(x)+1e-10 (x-mu)/sig } )
# Calculate V = 1/N * t(X) %*% X D = diag(V) # Check it works, compared to base function in R plot(var(X),V)
Vinv=MASS::ginv(V)
Y=merge(genomes$fam,by.x="sample.ID", dry[,c("id","Survival_fruit_mli")], by.y="id",all.x=T)$Survival_fruit_mli Y=relative(Y) Y = meanvarcent(Y) qplot(Y,geom="histogram",main="Survival to reproduction (Madrid+drought)") Y.bad=Y Y.bad[is.na(Y.bad)]<-mean(Y.bad,na.rm=TRUE) # Mean impute Y=merge(genomes$fam,by.x="sample.ID", dry[,c("id","Survival_fruit_thi")], by.y="id",all.x=T)$Survival_fruit_thi Y=relative(Y) Y = meanvarcent(Y) qplot(Y,geom="histogram",main="Survival to reproduction (Tübingen+water)") Y.good=Y Y.good[is.na(Y.good)]<-mean(Y.good,na.rm=TRUE) # Mean impute
b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.bad # check if dividing is needed by individual bgwa= solve(t(X)%*%X) %*% t(X) %*% Y.bad # the diagonal matrix not necessary, just visualization p1=ggdotscolor(b,bgwa, ylab = expression(beta[gwa]), xlab = expression(beta) ) %>% addggregression(se=FALSE) p1=p1 + geom_hline(yintercept = 0,lty="dashed",size=0.2)+ geom_vline(xintercept = 0,lty="dashed",size=0.2)+ ggtitle("Madrid+Drought") p2<-ggplot(data.frame(bgwa)) +geom_histogram(aes(x=bgwa)) +xlab(expression(beta[gwa])) p3<-ggplot(data.frame(bgwa)) +geom_histogram(aes(x=b)) +xlab(expression(beta)) plot_grid(p1,plot_grid(p2,p3,ncol=2), ncol=1) b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.good bgwa= solve(t(X)%*%X) %*% t(X) %*% Y.good p2=ggdotscolor(b,bgwa, ylab = expression(beta[gwa]), xlab = expression(beta) ) %>% addggregression(se=FALSE) p2=p2 + geom_hline(yintercept = 0,lty="dashed",size=0.2)+ geom_vline(xintercept = 0,lty="dashed",size=0.2) + ggtitle("Tübingen+Water") p4<-ggplot(data.frame(bgwa)) +geom_histogram(aes(x=bgwa)) +xlab(expression(beta[gwa])) p5<-ggplot(data.frame(bgwa)) +geom_histogram(aes(x=b)) +xlab(expression(beta)) plot_grid(p2,plot_grid(p4,p5,ncol=2),rel_heights = 2,rel_widths = 1, ncol=1)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.